Required libraries

library(tidyverse)
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
library(readxl)
library(fs)
library(RColorBrewer)

Loading data, modifying slightly, and combining

# The manuscripts: every *.jsonl file in ../data is streamed in, converted to
# a tibble and tagged with the name of the file it came from; the per-file
# tibbles are then row-bound into one table.
allplays <- map_dfr(
  dir_ls("../data", glob = "*.jsonl"),
  function(fn) {
    records <- tibble(jsonlite::stream_in(file(fn), verbose = FALSE))
    # strip the first " (" found in the speaker field
    records <- mutate(
      records,
      speaker = stringr::str_remove(speaker, " \\(")
    )
    add_column(records, filename = basename(fn))
  }
)

# label combining title and year, used later to identify each play
allplays <- mutate(allplays, title_year = paste(title, year))

# Excel sheet with character aliases. The Alias column is split on ", " so
# that every alias gets its own row in `variant`; both the main name
# (Karakter) and the alias are lower-cased.
aliases <- unnest_tokens(
  read_excel("../data/Rolleliste.xlsx"),
  variant, Alias,
  token = "regex", pattern = ", "
)
aliases <- mutate(aliases, across(c(Karakter, variant), tolower))
aliases
# Rename speakers to their main name: speakers are lower-cased, matched
# against the alias table per file, and replaced by Karakter when a match
# exists; unmatched speakers keep their (lower-cased) name.
allplays <- left_join(
  mutate(allplays, speaker = tolower(speaker)),
  aliases,
  by = c("filename" = "Filnavn", "speaker" = "variant")
)
allplays <- mutate(allplays, speaker = coalesce(Karakter, speaker))

# Excel sheet with the gender and social class of all characters.
gender_social <- read_excel("../data/gender_AND_Mask_alaw_sammenlagte_karakterer_reduceret_i_kategoriantal.xlsx") %>%
  # Recode gender: "f" -> "Female"; "mp" is first reduced to "m" so that it
  # ends up as "Male" together with plain "m"; "mfp" -> "Mixture".
  mutate(
    gender = gsub("^f$", "Female", gender),
    gender = gsub("mp", "m", gender),
    gender = gsub("^m$", "Male", gender),
    gender = gsub("mfp", "Mixture", gender)
  ) %>%
  # Merge the two social-status columns into a single `social_status` column.
  unite(
    col = "social_status",
    "social status, main character", "social status, other characters",
    sep = "", na.rm = TRUE
  ) %>%
  # Strip a literal "NA" left at the start or the end of the merged value.
  mutate(
    social_status = gsub("^NA", "", social_status),
    social_status = gsub("NA$", "", social_status)
  )

gender_social
# Social statuses that are not of particular interest (including the empty
# string) are pooled under the single label "Other".
other <- c("Artisans","Extern","Children","Riding bailiff/Proprietor","Church officials","","Military/Law enforcement", "Performers/Artists")

gender_social <- gender_social %>%
  mutate(
    social_status = replace(social_status, social_status %in% other, "Other")
  )

# Attach gender and social class to every line of the plays. The lookup uses
# columns 1, 2, 7 and 8 of gender_social by position; the join keys show that
# these include `play` and `speaker`.
# NOTE(review): presumably the other two are gender and social_status - verify
# against the sheet, since positional selection breaks if columns move.
allplays <- left_join(
  mutate(allplays, docTitle = tolower(docTitle)),
  select(gender_social, 1, 2, 7, 8),
  by = c("docTitle" = "play", "speaker")
)

allplays

Identifying characters that are present on stage but silent

# All distinct, non-missing character names, kept as a one-column tibble so
# that it can be used as the right-hand side of the semi-joins below.
names <- allplays %>%
  filter(!is.na(speaker)) %>%
  distinct(speaker)

# The names of the characters on stage are recorded in the `stage` and
# `speaker_stage` variables. Each variable is split into tokens, one per row.

# Explicit entries: stage values that do not open with "(" are split on
# commas (with optional trailing spaces); the original stage text is kept
# during tokenizing and then dropped by the column selection.
explicit_stage_tokens <- allplays %>%
  filter(!is.na(stage), !startsWith(stage, "(")) %>%
  unnest_tokens(
    word, stage,
    token = "regex", pattern = ", *", drop = FALSE
  ) %>%
  select(docTitle, filename, title, act, scene, index, word) %>%
  distinct()

# Implicit entries: parenthetical stage values are split with the default
# tokenizer of unnest_tokens().
implicit_stage_tokens <- allplays %>%
  filter(!is.na(stage), startsWith(stage, "(")) %>%
  unnest_tokens(word, stage) %>%
  select(docTitle, filename, title, act, scene, index, word) %>%
  distinct()

# Tokens from the speaker_stage variable; rows without a token are dropped.
speaker_stage_tokens <- allplays %>%
  unnest_tokens(word, speaker_stage) %>%
  drop_na(word) %>%
  select(docTitle, filename, title, act, scene, index, word)
  
# Keep only the tokens that are character names, i.e. tokens that occur in
# the `speaker` column of `names`.
explicit_names_in_stage <- semi_join(
  explicit_stage_tokens, names,
  by = c("word" = "speaker")
)

implicit_names_in_stage <- semi_join(
  implicit_stage_tokens, names,
  by = c("word" = "speaker")
)

names_in_speaker_stage <- semi_join(
  speaker_stage_tokens, names,
  by = c("word" = "speaker")
)

# Combine the three sources; the full joins match on all shared columns.
all_people_in_stage <- reduce(
  list(
    explicit_names_in_stage,
    implicit_names_in_stage,
    names_in_speaker_stage
  ),
  full_join
)

# the characters who speak in each scene
speakers <- allplays %>%
  filter(!is.na(speaker)) %>%
  select(filename, act, scene, speaker) %>%
  distinct()

# Find the aliases of every speaker within each file, act, and scene.
# For each speaker row the alias table is searched *within the same file*
# (as in the other alias joins of this script): the speaker is looked up
# among the variants, mapped to its main name(s), and every variant of those
# main names is emitted for the same file/act/scene.
# `%in%` is used instead of `==` so that a speaker matching zero or several
# alias rows is handled without vector recycling, and the rows are collected
# in a list and bound once instead of growing a data frame inside a loop.
speakers_w_aliases <- bind_rows(
  lapply(seq_len(nrow(speakers)), function(i) {
    in_play <- aliases$Filnavn %in% speakers$filename[i]
    main_name <- aliases$Karakter[
      in_play & aliases$variant %in% speakers$speaker[i]
    ]
    # a missing main name must not match alias rows with a missing Karakter
    main_name <- main_name[!is.na(main_name)]
    alias_names <- unique(
      aliases$variant[in_play & aliases$Karakter %in% main_name]
    )
    n_alias <- length(alias_names)
    tibble(
      filename = rep(speakers$filename[i], n_alias),
      act = rep(speakers$act[i], n_alias),
      scene = rep(speakers$scene[i], n_alias),
      speaker = alias_names
    )
  })
)

# add aliases to speakers
speakers <- bind_rows(speakers, speakers_w_aliases) %>%
  distinct()

# Silent characters: everyone named on stage who is not among the speakers
# (or their aliases) of the same file, act, and scene. The remaining names
# are then mapped to their main name via the alias table (falling back to the
# token itself) and enriched with gender and social class.
silent <- all_people_in_stage %>%
  anti_join(
    speakers,
    by = c("filename", "act", "scene", "word" = "speaker")
  ) %>%
  distinct() %>%
  left_join(aliases, by = c("filename" = "Filnavn", "word" = "variant")) %>%
  mutate(speaker = coalesce(Karakter, word)) %>%
  left_join(
    select(gender_social, 1, 2, 7, 8),
    by = c("docTitle" = "play", "speaker")
  )

silent

Preparing a data frame and plotting the proportion of silent characters

# counting silent characters by gender (mixture and NAs removed) and class
df_silent <- silent %>%
  count(gender, social_status) %>%
  filter(!gender %in% c("Mixture", NA))

# Count silent characters by gender only. The totals are derived from the
# data, so they stay correct whichever genders are present and in whatever
# order they appear.
df_total <- df_silent %>%
  group_by(gender) %>%
  summarise(n = sum(n)) %>%
  mutate(social_status = "Total") %>%
  select(gender, social_status, n)
total <- df_total$n

df_silent <- bind_rows(df_silent, df_total)

# prepare df as above but both with silent and speaking characters
all_people_in_stage <- all_people_in_stage %>%
  left_join(aliases, by = c("filename" = "Filnavn", "word" = "variant")) %>%
  mutate(speaker = if_else(!is.na(Karakter), Karakter, word)) %>%
  left_join(
    gender_social[, c(1, 2, 7, 8)],
    by = c("docTitle" = "play", "speaker" = "speaker")
  )

df_all <- all_people_in_stage %>%
  count(gender, social_status) %>%
  filter(!gender %in% c("Mixture", NA))

df_total2 <- df_all %>%
  group_by(gender) %>%
  summarise(n = sum(n)) %>%
  mutate(social_status = "Total") %>%
  select(gender, social_status, n)
total2 <- df_total2$n

# keep only the gender/class combinations that also occur in df_silent
df_all <- bind_rows(df_all, df_total2) %>%
  semi_join(df_silent, by = c("gender", "social_status"))

# Calculate the percentage of silent occurrences. The denominator is matched
# on gender and social class instead of relying on both tables having the
# same row order.
df_silent <- df_silent %>%
  left_join(
    rename(df_all, n_all = n),
    by = c("gender", "social_status")
  ) %>%
  mutate(percentage = (n / n_all) * 100)

# Percentages aggregated across gender: silent and overall counts are pooled
# first and the percentage is computed from the pooled counts. Adding the
# female and male percentages would mix different denominators and overstate
# the share.
df_silent_agg <- df_silent %>%
  group_by(social_status) %>%
  summarise(gender = "Both", n = sum(n), n_all = sum(n_all)) %>%
  mutate(percentage = (n / n_all) * 100)

df_silent <- bind_rows(df_silent, df_silent_agg) %>%
  select(gender, social_status, n, percentage) %>%
  mutate(
    gender = factor(gender, levels = c("Female", "Male", "Both")),
    social_status = factor(social_status)
  )

df_all <- mutate(df_all, social_status = factor(social_status))


# Plot: percentage of silent occurrences per social class, one panel per
# gender group. All bars share one colour except the eighth fill level, which
# gets a darker shade; the legend is hidden because the x axis already names
# the classes.
ggplot(
  data = df_silent,
  mapping = aes(x = social_status, y = percentage, fill = social_status)
) +
  geom_bar(stat = "identity") +
  facet_wrap(vars(gender), scales = "free_x") +
  scale_fill_manual(values = c(rep("#02818A", 7), "#024f54")) +
  scale_y_continuous(
    limits = c(0, 43),
    breaks = seq(0, 40, by = 10),
    expand = c(0, 0)
  ) +
  xlab("Social class") +
  ylab("Percentage of silent occurrences") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 40, hjust = 1),
    panel.grid.major.x = element_blank()
  )

# Alternative plot: female and male percentages stacked in one bar per social
# class, each segment labelled with its rounded percentage.
ggplot(
  data = filter(df_silent, gender %in% c("Female", "Male")),
  mapping = aes(x = social_status, y = percentage, fill = gender)
) +
  geom_bar(stat = "identity") +
  geom_text(
    mapping = aes(label = paste0(round(percentage, 1), "%")),
    position = position_stack(vjust = 0.9),
    size = 3,
    color = "white"
  ) +
  scale_fill_manual(values = c("#d6604d", "#4393c3")) +
  scale_y_continuous(
    limits = c(0, 43),
    breaks = seq(0, 40, by = 10),
    expand = c(0, 0)
  ) +
  xlab("Social class") +
  ylab("Percentage of silent occurrences") +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 40, hjust = 1),
    panel.grid.major.x = element_blank()
  )

A data frame is prepared for plotting percentages of speech

# Words spoken per play, act, scene and speaker (with the speaker's gender
# and social status carried along as grouping columns).
df <- allplays %>%
  # keep dialogue rows only: act, scene and speaker filled in, speech present
  filter(
    act != "",
    scene != "",
    !is.na(speaker),
    speaker != "",
    !is.na(spoke)
  ) %>%
  # number of words in each speech
  mutate(n_spoken_words = str_count(spoke, '\\w+')) %>%
  group_by(
    year, title_year, act_number, scene_number,
    speaker, gender, social_status
  ) %>%
  # total words per group
  summarise(words = sum(n_spoken_words))

# Positions of the act separators in the plots: the highest scene number of
# each act is accumulated over the acts and shifted by half a step.
borders <- df %>%
  group_by(act_number) %>%
  summarise(max_scene = max(scene_number)) %>%
  mutate(borders = cumsum(max_scene) + 0.5) %>%
  pull(borders)

df <- df %>%
  # each row's share (in percent) of all words spoken in its play
  group_by(title_year) %>%
  mutate(percent = (words / sum(words)) * 100) %>%
  ungroup() %>%
  mutate(
    # act:scene label with the scene number zero-padded to two digits
    act_scene = paste0(act_number, ":", str_pad(scene_number, 2, pad = "0")),
    # minor correction of one title's capitalisation
    title_year = gsub("ARTAXERXES", "Artaxerxes", title_year)
  ) %>%
  # remove plays that are very short
  filter(!title_year %in% c("Nytårsprolog til en komedie (1723)", "Den danske komedies ligbegængelse (1746)"))

# Order the plays by first appearance in df. The summarise above grouped by
# year first, which should leave the rows sorted chronologically.
play_chronology <- unique(df$title_year)
df <- mutate(df, title_year = factor(title_year, levels = play_chronology))

Coloring by gender

# Stacked bars of the percentage of spoken words per act:scene, coloured by
# gender, with one row of panels per play and vertical lines at the act
# borders.
df %>%
  group_by(title_year, act_number, scene_number, act_scene, gender) %>%
  summarise(percent = sum(percent)) %>%
  # Label speakers without a known gender as "Unknown". gsub() only rewrites
  # the literal text "NA" and leaves real missing values untouched, so the
  # remaining NA values are filled in with coalesce().
  mutate(gender = coalesce(gsub("NA", "Unknown", gender), "Unknown")) %>%
  ggplot(aes(fill = gender, y = percent, x = act_scene)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#d6604d", "#4393c3", "#82817f", "black")) +
  xlab("Act:Scene") +
  ylab("Percentage of spoken words") +
  facet_grid(
    rows = vars("title_year" = title_year),
    switch = "x", scales = "free_y"
  ) +
  geom_vline(xintercept = borders, size = 0.2) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 90, size = 6, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 4),
    strip.text.y = element_text(angle = 0, hjust = 0)
  )

Coloring by social status

# Stacked bars of the percentage of spoken words per act:scene, coloured by
# social status, with one row of panels per play and vertical lines at the
# act borders.
status_share <- NULL
ggplot(
  data = df %>%
    group_by(title_year, act_number, scene_number, act_scene, social_status) %>%
    summarise(percent = sum(percent)),
  mapping = aes(fill = social_status, y = percent, x = act_scene)
) +
  geom_col() +
  scale_fill_brewer(palette = "Set3", direction = -1) +
  labs(x = "Act:Scene", y = "Percentage of spoken words") +
  facet_grid(
    rows = vars("title_year" = title_year),
    scales = "free_y", switch = "x"
  ) +
  geom_vline(xintercept = borders, size = 0.2) +
  theme_bw() +
  theme(
    strip.text.y = element_text(angle = 0, hjust = 0),
    axis.text.y = element_text(size = 4),
    axis.text.x = element_text(angle = 90, size = 6, hjust = 0, vjust = 0.5),
    legend.title = element_blank(),
    legend.position = "bottom"
  )